We will be exploring the dataset from https://ourworldindata.org/coronavirus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
import plotly.offline as py
%matplotlib inline
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
df = pd.read_csv(url)
df
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-01-03 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 1 | AFG | Asia | Afghanistan | 2020-01-04 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 2 | AFG | Asia | Afghanistan | 2020-01-05 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 3 | AFG | Asia | Afghanistan | 2020-01-06 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 4 | AFG | Asia | Afghanistan | 2020-01-07 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 323695 | ZWE | Africa | Zimbabwe | 2023-07-01 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | 0.000 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323696 | ZWE | Africa | Zimbabwe | 2023-07-02 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | 0.000 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323697 | ZWE | Africa | Zimbabwe | 2023-07-03 | 265604.0 | 80.0 | 11.429 | 5709.0 | 2.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323698 | ZWE | Africa | Zimbabwe | 2023-07-04 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323699 | ZWE | Africa | Zimbabwe | 2023-07-05 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
323700 rows × 67 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 323700 entries, 0 to 323699 Data columns (total 67 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 iso_code 323700 non-null object 1 continent 308332 non-null object 2 location 323700 non-null object 3 date 323700 non-null object 4 total_cases 286549 non-null float64 5 new_cases 314782 non-null float64 6 new_cases_smoothed 313523 non-null float64 7 total_deaths 265511 non-null float64 8 new_deaths 314824 non-null float64 9 new_deaths_smoothed 313594 non-null float64 10 total_cases_per_million 286549 non-null float64 11 new_cases_per_million 314782 non-null float64 12 new_cases_smoothed_per_million 313523 non-null float64 13 total_deaths_per_million 265511 non-null float64 14 new_deaths_per_million 314824 non-null float64 15 new_deaths_smoothed_per_million 313594 non-null float64 16 reproduction_rate 184817 non-null float64 17 icu_patients 36791 non-null float64 18 icu_patients_per_million 36791 non-null float64 19 hosp_patients 37673 non-null float64 20 hosp_patients_per_million 37673 non-null float64 21 weekly_icu_admissions 9721 non-null float64 22 weekly_icu_admissions_per_million 9721 non-null float64 23 weekly_hosp_admissions 22425 non-null float64 24 weekly_hosp_admissions_per_million 22425 non-null float64 25 total_tests 79387 non-null float64 26 new_tests 75403 non-null float64 27 total_tests_per_thousand 79387 non-null float64 28 new_tests_per_thousand 75403 non-null float64 29 new_tests_smoothed 103965 non-null float64 30 new_tests_smoothed_per_thousand 103965 non-null float64 31 positive_rate 95927 non-null float64 32 tests_per_case 94348 non-null float64 33 tests_units 106788 non-null object 34 total_vaccinations 76588 non-null float64 35 people_vaccinated 73341 non-null float64 36 people_fully_vaccinated 69866 non-null float64 37 total_boosters 45062 non-null float64 38 new_vaccinations 63042 non-null float64 39 new_vaccinations_smoothed 172904 non-null float64 40 total_vaccinations_per_hundred 76588 non-null float64 41 people_vaccinated_per_hundred 73341 non-null float64 42 people_fully_vaccinated_per_hundred 69866 non-null float64 43 total_boosters_per_hundred 45062 non-null float64 44 new_vaccinations_smoothed_per_million 172904 non-null float64 45 new_people_vaccinated_smoothed 172695 non-null float64 46 new_people_vaccinated_smoothed_per_hundred 172695 non-null float64 47 stringency_index 197651 non-null float64 48 population_density 274685 non-null float64 49 median_age 255470 non-null float64 50 aged_65_older 246525 non-null float64 51 aged_70_older 252910 non-null float64 52 gdp_per_capita 250365 non-null float64 53 extreme_poverty 161293 non-null float64 54 cardiovasc_death_rate 250879 non-null float64 55 diabetes_prevalence 263650 non-null float64 56 female_smokers 188175 non-null float64 57 male_smokers 185615 non-null float64 58 handwashing_facilities 122886 non-null float64 59 hospital_beds_per_thousand 221455 non-null float64 60 life_expectancy 297710 non-null float64 61 human_development_index 243170 non-null float64 62 population 323700 non-null float64 63 excess_mortality_cumulative_absolute 11245 non-null float64 64 excess_mortality_cumulative 11245 non-null float64 65 excess_mortality 11245 non-null float64 66 excess_mortality_cumulative_per_million 11245 non-null float64 dtypes: float64(62), object(5) memory usage: 165.5+ MB
# Use list comprehension to go through every column in df and find the average number of missing values as a percentage
NAN = [(c, df[c].isna().mean()*100) for c in df]
# Convert this list to another dataframe and name columns appropriately
NAN = pd.DataFrame(NAN, columns=["Column Name", "Percentage"])
pd.options.display.max_rows = 67 # So we can see all the rows in NAN
NAN
| Column Name | Percentage | |
|---|---|---|
| 0 | iso_code | 0.000000 |
| 1 | continent | 4.747606 |
| 2 | location | 0.000000 |
| 3 | date | 0.000000 |
| 4 | total_cases | 11.476985 |
| 5 | new_cases | 2.755020 |
| 6 | new_cases_smoothed | 3.143960 |
| 7 | total_deaths | 17.976213 |
| 8 | new_deaths | 2.742045 |
| 9 | new_deaths_smoothed | 3.122027 |
| 10 | total_cases_per_million | 11.476985 |
| 11 | new_cases_per_million | 2.755020 |
| 12 | new_cases_smoothed_per_million | 3.143960 |
| 13 | total_deaths_per_million | 17.976213 |
| 14 | new_deaths_per_million | 2.742045 |
| 15 | new_deaths_smoothed_per_million | 3.122027 |
| 16 | reproduction_rate | 42.904850 |
| 17 | icu_patients | 88.634229 |
| 18 | icu_patients_per_million | 88.634229 |
| 19 | hosp_patients | 88.361755 |
| 20 | hosp_patients_per_million | 88.361755 |
| 21 | weekly_icu_admissions | 96.996911 |
| 22 | weekly_icu_admissions_per_million | 96.996911 |
| 23 | weekly_hosp_admissions | 93.072289 |
| 24 | weekly_hosp_admissions_per_million | 93.072289 |
| 25 | total_tests | 75.475131 |
| 26 | new_tests | 76.705901 |
| 27 | total_tests_per_thousand | 75.475131 |
| 28 | new_tests_per_thousand | 76.705901 |
| 29 | new_tests_smoothed | 67.882298 |
| 30 | new_tests_smoothed_per_thousand | 67.882298 |
| 31 | positive_rate | 70.365462 |
| 32 | tests_per_case | 70.853259 |
| 33 | tests_units | 67.010195 |
| 34 | total_vaccinations | 76.339821 |
| 35 | people_vaccinated | 77.342910 |
| 36 | people_fully_vaccinated | 78.416435 |
| 37 | total_boosters | 86.079086 |
| 38 | new_vaccinations | 80.524560 |
| 39 | new_vaccinations_smoothed | 46.585110 |
| 40 | total_vaccinations_per_hundred | 76.339821 |
| 41 | people_vaccinated_per_hundred | 77.342910 |
| 42 | people_fully_vaccinated_per_hundred | 78.416435 |
| 43 | total_boosters_per_hundred | 86.079086 |
| 44 | new_vaccinations_smoothed_per_million | 46.585110 |
| 45 | new_people_vaccinated_smoothed | 46.649676 |
| 46 | new_people_vaccinated_smoothed_per_hundred | 46.649676 |
| 47 | stringency_index | 38.940068 |
| 48 | population_density | 15.142107 |
| 49 | median_age | 21.078159 |
| 50 | aged_65_older | 23.841520 |
| 51 | aged_70_older | 21.869015 |
| 52 | gdp_per_capita | 22.655236 |
| 53 | extreme_poverty | 50.172073 |
| 54 | cardiovasc_death_rate | 22.496447 |
| 55 | diabetes_prevalence | 18.551128 |
| 56 | female_smokers | 41.867470 |
| 57 | male_smokers | 42.658326 |
| 58 | handwashing_facilities | 62.037071 |
| 59 | hospital_beds_per_thousand | 31.586345 |
| 60 | life_expectancy | 8.029039 |
| 61 | human_development_index | 24.877973 |
| 62 | population | 0.000000 |
| 63 | excess_mortality_cumulative_absolute | 96.526104 |
| 64 | excess_mortality_cumulative | 96.526104 |
| 65 | excess_mortality | 96.526104 |
| 66 | excess_mortality_cumulative_per_million | 96.526104 |
print(df["location"].unique())
print(len(df["location"].unique()))
['Afghanistan' 'Africa' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola' 'Anguilla' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Asia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia' 'Bonaire Sint Eustatius and Saba' 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands' 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde' 'Cayman Islands' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo' 'Cook Islands' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Curacao' 'Cyprus' 'Czechia' 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'England' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Europe' 'European Union' 'Faeroe Islands' 'Falkland Islands' 'Fiji' 'Finland' 'France' 'French Guiana' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Gibraltar' 'Greece' 'Greenland' 'Grenada' 'Guadeloupe' 'Guam' 'Guatemala' 'Guernsey' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'High income' 'Honduras' 'Hong Kong' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Isle of Man' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jersey' 'Jordan' 'Kazakhstan' 'Kenya' 'Kiribati' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia' 'Lebanon' 'Lesotho' 'Liberia' 'Libya' 'Liechtenstein' 'Lithuania' 'Low income' 'Lower middle income' 'Luxembourg' 'Macao' 'Madagascar' 'Malawi' 'Malaysia' 'Maldives' 'Mali' 'Malta' 'Marshall Islands' 'Martinique' 'Mauritania' 'Mauritius' 'Mayotte' 'Mexico' 'Micronesia (country)' 'Moldova' 'Monaco' 'Mongolia' 'Montenegro' 'Montserrat' 'Morocco' 'Mozambique' 'Myanmar' 'Namibia' 'Nauru' 'Nepal' 'Netherlands' 'New Caledonia' 'New Zealand' 'Nicaragua' 'Niger' 'Nigeria' 'Niue' 'North America' 'North Korea' 'North Macedonia' 'Northern Cyprus' 'Northern Ireland' 'Northern Mariana Islands' 'Norway' 'Oceania' 'Oman' 'Pakistan' 'Palau' 'Palestine' 'Panama' 'Papua New Guinea' 'Paraguay' 'Peru' 'Philippines' 'Pitcairn' 'Poland' 'Portugal' 'Puerto Rico' 'Qatar' 'Reunion' 'Romania' 'Russia' 'Rwanda' 'Saint Barthelemy' 'Saint Helena' 'Saint Kitts and Nevis' 'Saint Lucia' 'Saint Martin (French part)' 'Saint Pierre and Miquelon' 'Saint Vincent and the Grenadines' 'Samoa' 'San Marino' 'Sao Tome and Principe' 'Saudi Arabia' 'Scotland' 'Senegal' 'Serbia' 'Seychelles' 'Sierra Leone' 'Singapore' 'Sint Maarten (Dutch part)' 'Slovakia' 'Slovenia' 'Solomon Islands' 'Somalia' 'South Africa' 'South America' 'South Korea' 'South Sudan' 'Spain' 'Sri Lanka' 'Sudan' 'Suriname' 'Sweden' 'Switzerland' 'Syria' 'Taiwan' 'Tajikistan' 'Tanzania' 'Thailand' 'Timor' 'Togo' 'Tokelau' 'Tonga' 'Trinidad and Tobago' 'Tunisia' 'Turkey' 'Turkmenistan' 'Turks and Caicos Islands' 'Tuvalu' 'Uganda' 'Ukraine' 'United Arab Emirates' 'United Kingdom' 'United States' 'United States Virgin Islands' 'Upper middle income' 'Uruguay' 'Uzbekistan' 'Vanuatu' 'Vatican' 'Venezuela' 'Vietnam' 'Wales' 'Wallis and Futuna' 'Western Sahara' 'World' 'Yemen' 'Zambia' 'Zimbabwe'] 255
# Delete World and Continent Summary Figures
df.drop(df[df["continent"].isna()].index, inplace=True)
# Get most recent date
df_most_recent = df[df["date"]==max(df["date"])].reset_index()
df_most_recent
| index | iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24323 | BGD | Asia | Bangladesh | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 44.7 | 34.808 | 0.800 | 72.59 | 0.632 | 1.711864e+08 | NaN | NaN | NaN | NaN |
| 1 | 43524 | BGR | Europe | Bulgaria | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 44.4 | NaN | 7.454 | 75.05 | 0.816 | 6.781955e+06 | NaN | NaN | NaN | NaN |
| 2 | 71685 | CZE | Europe | Czechia | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 38.3 | NaN | 6.630 | 79.38 | 0.900 | 1.049399e+07 | NaN | NaN | NaN | NaN |
| 3 | 110009 | GRC | Europe | Greece | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 52.0 | NaN | 4.210 | 82.24 | 0.888 | 1.038497e+07 | NaN | NaN | NaN | NaN |
| 4 | 130446 | IND | Asia | India | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 20.6 | 59.550 | 0.530 | 69.66 | 0.645 | 1.417173e+09 | NaN | NaN | NaN | NaN |
| 5 | 152207 | KGZ | Asia | Kyrgyzstan | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 50.5 | 89.220 | 4.500 | 71.45 | 0.697 | 6.630621e+06 | NaN | NaN | NaN | NaN |
| 6 | 170924 | MYS | Asia | Malaysia | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 42.4 | NaN | 1.900 | 76.16 | 0.810 | 3.393822e+07 | NaN | NaN | NaN | NaN |
| 7 | 199087 | NLD | Europe | Netherlands | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 27.3 | NaN | 3.320 | 82.28 | 0.944 | 1.756402e+07 | NaN | NaN | NaN | NaN |
| 8 | 279000 | SWE | Europe | Sweden | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 18.9 | NaN | 2.220 | 82.80 | 0.945 | 1.054935e+07 | NaN | NaN | NaN | NaN |
| 9 | 309706 | URY | South America | Uruguay | 2023-07-06 | NaN | NaN | NaN | NaN | NaN | ... | 19.9 | NaN | 2.800 | 77.91 | 0.817 | 3.422796e+06 | NaN | NaN | NaN | NaN |
10 rows × 68 columns
There seems to be no useful data for the most recent date in the dataset. Try finding the most recent date with no NAN values for total cases
df_most_recent_useful = df[df["total_cases"].notnull()].reset_index()
df_most_recent_useful
| index | iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 54 | AFG | Asia | Afghanistan | 2020-02-26 | 1.0 | 1.0 | 0.143 | NaN | 0.0 | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 1 | 55 | AFG | Asia | Afghanistan | 2020-02-27 | 1.0 | 0.0 | 0.143 | NaN | 0.0 | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 2 | 56 | AFG | Asia | Afghanistan | 2020-02-28 | 1.0 | 0.0 | 0.143 | NaN | 0.0 | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 3 | 57 | AFG | Asia | Afghanistan | 2020-02-29 | 1.0 | 0.0 | 0.143 | NaN | 0.0 | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 4 | 58 | AFG | Asia | Afghanistan | 2020-03-01 | 1.0 | 0.0 | 0.143 | NaN | 0.0 | ... | NaN | 37.746 | 0.5 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271402 | 323695 | ZWE | Africa | Zimbabwe | 2023-07-01 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 271403 | 323696 | ZWE | Africa | Zimbabwe | 2023-07-02 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 271404 | 323697 | ZWE | Africa | Zimbabwe | 2023-07-03 | 265604.0 | 80.0 | 11.429 | 5709.0 | 2.0 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 271405 | 323698 | ZWE | Africa | Zimbabwe | 2023-07-04 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 271406 | 323699 | ZWE | Africa | Zimbabwe | 2023-07-05 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
271407 rows × 68 columns
Seems like 2023-06-21 is the most recent date with a value for total_cases. We will use this date as the most recent date.
df_most_recent = df[df["date"] == "2023-06-21"].reset_index()
df_most_recent
| index | iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1265 | AFG | Asia | Afghanistan | 2023-06-21 | 223059.0 | 38.0 | 40.714 | 7924.0 | 0.0 | ... | NaN | 37.746 | 0.50 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 1 | 3825 | ALB | Europe | Albania | 2023-06-21 | 334090.0 | 0.0 | 0.000 | 3604.0 | 0.0 | ... | 51.2 | NaN | 2.89 | 78.57 | 0.795 | 2842318.0 | NaN | NaN | NaN | NaN |
| 2 | 5105 | DZA | Africa | Algeria | 2023-06-21 | 271847.0 | 0.0 | 0.000 | 6881.0 | 0.0 | ... | 30.4 | 83.741 | 1.90 | 76.88 | 0.748 | 44903228.0 | NaN | NaN | NaN | NaN |
| 3 | 6385 | ASM | Oceania | American Samoa | 2023-06-21 | 8332.0 | 0.0 | 0.000 | 34.0 | 0.0 | ... | NaN | NaN | NaN | 73.74 | NaN | 44295.0 | NaN | NaN | NaN | NaN |
| 4 | 7665 | AND | Europe | Andorra | 2023-06-21 | 48015.0 | 0.0 | 0.000 | 159.0 | 0.0 | ... | 37.8 | NaN | NaN | 83.73 | 0.868 | 79843.0 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234 | 317283 | OWID_WLS | Europe | Wales | 2023-06-21 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 3170000.0 | NaN | NaN | NaN | NaN |
| 235 | 318563 | WLF | Oceania | Wallis and Futuna | 2023-06-21 | 3550.0 | 0.0 | 0.000 | 8.0 | 0.0 | ... | NaN | NaN | NaN | 79.94 | NaN | 11596.0 | NaN | NaN | NaN | NaN |
| 236 | 321125 | YEM | Asia | Yemen | 2023-06-21 | 11945.0 | 0.0 | 0.000 | 2159.0 | 0.0 | ... | 29.2 | 49.542 | 0.70 | 66.12 | 0.470 | 33696612.0 | NaN | NaN | NaN | NaN |
| 237 | 322405 | ZMB | Africa | Zambia | 2023-06-21 | 345961.0 | 0.0 | 129.000 | 4060.0 | 0.0 | ... | 24.7 | 13.938 | 2.00 | 63.89 | 0.584 | 20017670.0 | NaN | NaN | NaN | NaN |
| 238 | 323685 | ZWE | Africa | Zimbabwe | 2023-06-21 | 265413.0 | 0.0 | 17.714 | 5707.0 | 0.0 | ... | 30.7 | 36.791 | 1.70 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
239 rows × 68 columns
# Need to delete the World and continent summary figures
df_most_recent
| index | iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1265 | AFG | Asia | Afghanistan | 2023-06-21 | 223059.0 | 38.0 | 40.714 | 7924.0 | 0.0 | ... | NaN | 37.746 | 0.50 | 64.83 | 0.511 | 41128772.0 | NaN | NaN | NaN | NaN |
| 1 | 3825 | ALB | Europe | Albania | 2023-06-21 | 334090.0 | 0.0 | 0.000 | 3604.0 | 0.0 | ... | 51.2 | NaN | 2.89 | 78.57 | 0.795 | 2842318.0 | NaN | NaN | NaN | NaN |
| 2 | 5105 | DZA | Africa | Algeria | 2023-06-21 | 271847.0 | 0.0 | 0.000 | 6881.0 | 0.0 | ... | 30.4 | 83.741 | 1.90 | 76.88 | 0.748 | 44903228.0 | NaN | NaN | NaN | NaN |
| 3 | 6385 | ASM | Oceania | American Samoa | 2023-06-21 | 8332.0 | 0.0 | 0.000 | 34.0 | 0.0 | ... | NaN | NaN | NaN | 73.74 | NaN | 44295.0 | NaN | NaN | NaN | NaN |
| 4 | 7665 | AND | Europe | Andorra | 2023-06-21 | 48015.0 | 0.0 | 0.000 | 159.0 | 0.0 | ... | 37.8 | NaN | NaN | 83.73 | 0.868 | 79843.0 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234 | 317283 | OWID_WLS | Europe | Wales | 2023-06-21 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 3170000.0 | NaN | NaN | NaN | NaN |
| 235 | 318563 | WLF | Oceania | Wallis and Futuna | 2023-06-21 | 3550.0 | 0.0 | 0.000 | 8.0 | 0.0 | ... | NaN | NaN | NaN | 79.94 | NaN | 11596.0 | NaN | NaN | NaN | NaN |
| 236 | 321125 | YEM | Asia | Yemen | 2023-06-21 | 11945.0 | 0.0 | 0.000 | 2159.0 | 0.0 | ... | 29.2 | 49.542 | 0.70 | 66.12 | 0.470 | 33696612.0 | NaN | NaN | NaN | NaN |
| 237 | 322405 | ZMB | Africa | Zambia | 2023-06-21 | 345961.0 | 0.0 | 129.000 | 4060.0 | 0.0 | ... | 24.7 | 13.938 | 2.00 | 63.89 | 0.584 | 20017670.0 | NaN | NaN | NaN | NaN |
| 238 | 323685 | ZWE | Africa | Zimbabwe | 2023-06-21 | 265413.0 | 0.0 | 17.714 | 5707.0 | 0.0 | ... | 30.7 | 36.791 | 1.70 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
239 rows × 68 columns
df_world = df_most_recent.groupby("date")[["total_cases", "new_cases", "total_deaths"]].sum().reset_index()
df_world
| date | total_cases | new_cases | total_deaths | |
|---|---|---|---|---|
| 0 | 2023-06-21 | 767352794.0 | 3579.0 | 6946651.0 |
# Create Column headers/labels
labels = ["Last Update", "Total Confirmed", "New Cases", "Total Deaths"]
fig = go.Figure(data=[go.Table(header = dict(values=labels),
cells = dict(values=df_world.loc[0, ["date", "total_cases", "new_cases", "total_deaths"]]))])
fig.update_layout(title="Covid-19 World Summary: ")
fig.show()
df_over_time = df.groupby("date")[["total_cases", "new_cases", "total_deaths"]].sum().reset_index().sort_values("date", ascending=True).reset_index(drop=True)
df_over_time
| date | total_cases | new_cases | total_deaths | |
|---|---|---|---|---|
| 0 | 2020-01-01 | 0.0 | 0.0 | 0.0 |
| 1 | 2020-01-02 | 0.0 | 0.0 | 0.0 |
| 2 | 2020-01-03 | 0.0 | 0.0 | 0.0 |
| 3 | 2020-01-04 | 3.0 | 3.0 | 0.0 |
| 4 | 2020-01-05 | 3.0 | 0.0 | 3.0 |
| ... | ... | ... | ... | ... |
| 1278 | 2023-07-02 | 767582199.0 | 18639.0 | 6948254.0 |
| 1279 | 2023-07-03 | 767725633.0 | 143434.0 | 6948743.0 |
| 1280 | 2023-07-04 | 767726097.0 | 464.0 | 6948751.0 |
| 1281 | 2023-07-05 | 767726097.0 | 0.0 | 6948751.0 |
| 1282 | 2023-07-06 | 0.0 | 0.0 | 0.0 |
1283 rows × 4 columns
We should remove the most recent dates until 2023-06-21 as they hold no useful data. First we should convert the dates to Timestamp objects so they are easier to work with
df_over_time["date"] = pd.to_datetime(df_over_time["date"])
df_over_time = df_over_time[df_over_time["date"] <= "2023-06-21"]
df_over_time
| date | total_cases | new_cases | total_deaths | |
|---|---|---|---|---|
| 0 | 2020-01-01 | 0.0 | 0.0 | 0.0 |
| 1 | 2020-01-02 | 0.0 | 0.0 | 0.0 |
| 2 | 2020-01-03 | 0.0 | 0.0 | 0.0 |
| 3 | 2020-01-04 | 3.0 | 3.0 | 0.0 |
| 4 | 2020-01-05 | 3.0 | 0.0 | 3.0 |
| ... | ... | ... | ... | ... |
| 1263 | 2023-06-17 | 767123321.0 | 6852.0 | 6945595.0 |
| 1264 | 2023-06-18 | 767176663.0 | 53342.0 | 6946130.0 |
| 1265 | 2023-06-19 | 767346813.0 | 170150.0 | 6946605.0 |
| 1266 | 2023-06-20 | 767349215.0 | 2402.0 | 6946626.0 |
| 1267 | 2023-06-21 | 767352794.0 | 3579.0 | 6946651.0 |
1268 rows × 4 columns
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_over_time["date"], y=df_over_time["total_cases"],
mode = "lines",
name = "Confirmed Cases"))
fig.update_layout(
title="Evolution of Confirmed Global Covid-19 Cases Over Time",
template="plotly_white",
yaxis_title="Confirmed Cases",
xaxis_title="Date",
)
fig.update_xaxes(
dtick="M1",
tickformat="%b-%Y",
)
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_over_time["date"], y=df_over_time["total_deaths"],
mode="lines", marker_color="red",
name="Total Deaths", line=dict(dash="dot")))
fig.update_layout(
title="Evolution of Global Covid-19 Deaths Over Time",
template="plotly_dark",
yaxis_title="Number of Deaths",
xaxis_title="Date",
)
fig.update_xaxes(
dtick="M1",
tickformat="%b-%Y",
)
fig.show()
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_over_time["date"], y=df_over_time["new_cases"],
mode="lines", marker_color="orange",
name="Total Deaths",))
fig.update_layout(
title="Evolution of Global Covid-19 New Cases Over Time",
template="plotly_dark",
yaxis_title="Number of New Cases",
xaxis_title="Date",
width=2000,
height=600
)
fig.update_xaxes(
dtick="M1",
tickformat="%b-%Y",
)
fig.show()
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 308332 entries, 0 to 323699 Data columns (total 67 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 iso_code 308332 non-null object 1 continent 308332 non-null object 2 location 308332 non-null object 3 date 308332 non-null object 4 total_cases 271407 non-null float64 5 new_cases 299422 non-null float64 6 new_cases_smoothed 298223 non-null float64 7 total_deaths 250519 non-null float64 8 new_deaths 299464 non-null float64 9 new_deaths_smoothed 298294 non-null float64 10 total_cases_per_million 271407 non-null float64 11 new_cases_per_million 299422 non-null float64 12 new_cases_smoothed_per_million 298223 non-null float64 13 total_deaths_per_million 250519 non-null float64 14 new_deaths_per_million 299464 non-null float64 15 new_deaths_smoothed_per_million 298294 non-null float64 16 reproduction_rate 183741 non-null float64 17 icu_patients 36791 non-null float64 18 icu_patients_per_million 36791 non-null float64 19 hosp_patients 37673 non-null float64 20 hosp_patients_per_million 37673 non-null float64 21 weekly_icu_admissions 9721 non-null float64 22 weekly_icu_admissions_per_million 9721 non-null float64 23 weekly_hosp_admissions 22425 non-null float64 24 weekly_hosp_admissions_per_million 22425 non-null float64 25 total_tests 79387 non-null float64 26 new_tests 75403 non-null float64 27 total_tests_per_thousand 79387 non-null float64 28 new_tests_per_thousand 75403 non-null float64 29 new_tests_smoothed 103965 non-null float64 30 new_tests_smoothed_per_thousand 103965 non-null float64 31 positive_rate 95927 non-null float64 32 tests_per_case 94348 non-null float64 33 tests_units 106788 non-null object 34 total_vaccinations 65585 non-null float64 35 people_vaccinated 62338 non-null float64 36 people_fully_vaccinated 59049 non-null float64 37 total_boosters 35211 non-null float64 38 new_vaccinations 52092 non-null float64 39 new_vaccinations_smoothed 161954 non-null float64 40 total_vaccinations_per_hundred 65585 non-null float64 41 people_vaccinated_per_hundred 62338 non-null float64 42 people_fully_vaccinated_per_hundred 59049 non-null float64 43 total_boosters_per_hundred 35211 non-null float64 44 new_vaccinations_smoothed_per_million 161954 non-null float64 45 new_people_vaccinated_smoothed 161745 non-null float64 46 new_people_vaccinated_smoothed_per_hundred 161745 non-null float64 47 stringency_index 197651 non-null float64 48 population_density 273404 non-null float64 49 median_age 254189 non-null float64 50 aged_65_older 245244 non-null float64 51 aged_70_older 251629 non-null float64 52 gdp_per_capita 249084 non-null float64 53 extreme_poverty 160012 non-null float64 54 cardiovasc_death_rate 249598 non-null float64 55 diabetes_prevalence 262369 non-null float64 56 female_smokers 186894 non-null float64 57 male_smokers 184334 non-null float64 58 handwashing_facilities 121605 non-null float64 59 hospital_beds_per_thousand 220174 non-null float64 60 life_expectancy 296429 non-null float64 61 human_development_index 241889 non-null float64 62 population 308332 non-null float64 63 excess_mortality_cumulative_absolute 11245 non-null float64 64 excess_mortality_cumulative 11245 non-null float64 65 excess_mortality 11245 non-null float64 66 excess_mortality_cumulative_per_million 11245 non-null float64 dtypes: float64(62), object(5) memory usage: 160.0+ MB
df["date"] = pd.to_datetime(df["date"])
df_per_country = df.groupby("location")[["new_cases", "new_deaths"]].sum().reset_index().sort_values("new_cases", ascending=False).reset_index(drop=True)
df_per_country
| location | new_cases | new_deaths | |
|---|---|---|---|
| 0 | United States | 103436829.0 | 1129589.0 |
| 1 | China | 99292081.0 | 121490.0 |
| 2 | India | 44995105.0 | 531908.0 |
| 3 | France | 38989382.0 | 167923.0 |
| 4 | Germany | 38435774.0 | 174807.0 |
| ... | ... | ... | ... |
| 238 | Turkmenistan | 0.0 | 0.0 |
| 239 | Western Sahara | 0.0 | 0.0 |
| 240 | Taiwan | 0.0 | 0.0 |
| 241 | North Korea | 0.0 | 0.0 |
| 242 | Northern Ireland | 0.0 | 0.0 |
243 rows × 3 columns
df_per_country.columns = ["Country", "Total Cases", "Total Deaths"]
df_per_country = df_per_country.astype({"Country":str, "Total Cases":int, "Total Deaths":int})
df_per_country.head(10)
| Country | Total Cases | Total Deaths | |
|---|---|---|---|
| 0 | United States | 103436829 | 1129589 |
| 1 | China | 99292081 | 121490 |
| 2 | India | 44995105 | 531908 |
| 3 | France | 38989382 | 167923 |
| 4 | Germany | 38435774 | 174807 |
| 5 | Brazil | 37671993 | 703964 |
| 6 | Japan | 33803572 | 74708 |
| 7 | South Korea | 32256154 | 35078 |
| 8 | Italy | 25897801 | 190868 |
| 9 | United Kingdom | 24639160 | 227739 |
fig = go.Figure(go.Bar(
x=df_per_country["Total Cases"],
y=df_per_country["Country"],
orientation="h"))
fig.update_layout(
title="Confirmed Cases in Each Country",
template="plotly_white",
xaxis_title="Confirmed Cases",
yaxis_title="Country",
)
fig.show()
fig = go.Figure(go.Bar(
x=df_per_country["Total Deaths"],
y=df_per_country["Country"],
orientation="h",
marker_color="black"))
fig.update_layout(
title="Confirmed Deaths in Each Country ",
template="plotly_white",
xaxis_title="Confirmed Deaths",
yaxis_title="Country",
)
fig.show()
Let's look at the total case numbers on a world map
fig = px.choropleth(df_per_country, locations=df_per_country["Country"],
color=df_per_country["Total Cases"], locationmode="country names",
hover_name=df_per_country["Country"],
color_continuous_scale=px.colors.sequential.OrRd,
template="plotly_dark", )
fig.update_layout(
title="Confirmed Cases in Each Country",
margin=dict(l=0, r=0, t=40, b=0)
)
fig.show()
Let's group by country and get the total number of cases and deaths per million
df_per_country_pm = df.groupby("location")[["new_cases_per_million", "new_deaths_per_million"]].sum().reset_index().sort_values("new_cases_per_million", ascending=False).reset_index(drop=True)
df_per_country_pm.columns = ["Country", "Total Cases PM", "Total Deaths PM"]
df_per_country_pm = df_per_country_pm.astype({"Country":str, "Total Cases PM":int, "Total Deaths PM":int})
df_per_country_pm.head(10)
| Country | Total Cases PM | Total Deaths PM | |
|---|---|---|---|
| 0 | Cyprus | 737554 | 1522 |
| 1 | San Marino | 722054 | 3710 |
| 2 | Brunei | 687696 | 358 |
| 3 | Austria | 680262 | 2520 |
| 4 | Faeroe Islands | 652484 | 527 |
| 5 | Slovenia | 634272 | 4439 |
| 6 | Gibraltar | 628882 | 3458 |
| 7 | Martinique | 626793 | 3003 |
| 8 | South Korea | 622515 | 676 |
| 9 | France | 603302 | 2598 |
fig = px.choropleth(df_per_country_pm, locations=df_per_country_pm["Country"],
color=df_per_country_pm["Total Cases PM"], locationmode="country names",
hover_name=df_per_country_pm["Country"],
color_continuous_scale=px.colors.sequential.Viridis,
template="plotly_dark", )
fig.update_layout(
title="Confirmed Cases Per Million in Each Country",
margin=dict(l=0, r=0, t=40, b=0)
)
fig.show()
fig = go.Figure(data=[go.Scatter(
x=df_per_country_pm["Country"][0:10],
y=df_per_country_pm["Total Cases PM"][0:10],
mode="markers",
marker=dict(
color=100+np.random.randn(500),
size=df_per_country_pm["Total Cases PM"][0:10]/10000,
showscale=True
)
)])
fig.update_layout(
title="Top 10 Most Infected Countries Per Million",
xaxis_title="Country",
yaxis_title="Total Cases PM",
template="plotly_white",
width=800
)
fig.show()
df.tail()
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | population | excess_mortality_cumulative_absolute | excess_mortality_cumulative | excess_mortality | excess_mortality_cumulative_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 323695 | ZWE | Africa | Zimbabwe | 2023-07-01 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | 0.000 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323696 | ZWE | Africa | Zimbabwe | 2023-07-02 | 265524.0 | 0.0 | 15.857 | 5707.0 | 0.0 | 0.000 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323697 | ZWE | Africa | Zimbabwe | 2023-07-03 | 265604.0 | 80.0 | 11.429 | 5709.0 | 2.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323698 | ZWE | Africa | Zimbabwe | 2023-07-04 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
| 323699 | ZWE | Africa | Zimbabwe | 2023-07-05 | 265604.0 | 0.0 | 11.429 | 5709.0 | 0.0 | 0.286 | ... | 30.7 | 36.791 | 1.7 | 61.49 | 0.571 | 16320539.0 | NaN | NaN | NaN | NaN |
5 rows × 67 columns
df_sorted = df.copy()
df_sorted = df.groupby(["date", "location",])[["total_cases"]].sum().reset_index().sort_values("date", ascending=True).reset_index(drop=True)
df_sorted = df_sorted[df_sorted["date"] <= "2023-06-21"] # Excluding very recent dates with no useful data
# Need to convert timestamp dates back to string form as timestamp objects are not compatible with plotly interactive map
df_sorted["date"] = df_sorted["date"].astype(str)
fig = px.choropleth(df_sorted, locations=df_sorted["location"],
color=df_sorted["total_cases"], locationmode="country names",
hover_name=df_sorted["location"],
color_continuous_scale=px.colors.sequential.Inferno,
animation_frame="date")
fig.update_layout(
title="Evolution of Confirmed Cases in Each Country",
template="plotly_dark",
legend_title_text="Total Cases",
)
fig.show()